#### #### #### #### #### #### #### #### #### #### #### #### 
## Paris ratchet paper 
## Code to load in data, prep for imputation
#### #### #### #### #### #### #### #### #### #### #### #### 

#### Contents

#' 1. Load in country-level data
#' --. On GHG emissions
#' --. On 2015 NDCs
#' --. On 2021 NDCs
#' --. On climate laws, WDI, V-Dem
#' --. And merge these together
#' 2. Multiply impute missing country-level variables
#' 3. Load in dyadic trade and IO data
#' --. From UN Comtrade for sectors
#' --. From CEPII for years
#' --. For IO memberships
#' 4. Multiply impute missing dyadic trade data
#' 5. Make spatial weights
#' --. Make trade matrices for total, EITE and green trade
#' --. Make IO matrices
#' --. Matrix multiply to climate target data to create spatial weights
#' 6. Merge these together into `data_for_models` object

## NOTE: 
#' Code for DescTools::Winsorize() is from version 0.99.47
#' The syntax is updated in 0.99.55 
#' The updated syntax reads: 
#' Winsorize(x, val = quantile(x, probs = c(0.05, 0.95), na.rm = T))
#' v0.99.47 is available here: 
#' https://cran.r-project.org/src/contrib/Archive/DescTools/DescTools_0.99.47.tar.gz
#' To install this older version of the package, run the following: 
#' package_desctools_url <- "https://cran.r-project.org/src/contrib/Archive/DescTools/DescTools_0.99.54.tar.gz"
#' install.packages(package_desctools_url, repos=NULL, type="source")


#### 0. Initialize #### 

library(tidyverse)
library(readxl)
library(lubridate)
library(data.table)
library(Amelia) # Multiple imputation
library(knitr)
library(DescTools) # Winsorize()
library(broom)
library(modelsummary)
library(lsa) # for cosine similarity

options(scipen=999) # Scientific notation off
# options(scipen=0)  # Scientific notation on

## Check working directory
list.files() 

#' Should see:
#' [1] "_readme_ratchet.html" "_readme_ratchet.Rmd"  "code"                
#' [4] "input"                "output"               "plots"               
#' [7] "text"
#' 
#' If not, set working directory properly


## Define a "max" function that works with NA
my.max <- function(x) ifelse( !all(is.na(x)), max(x, na.rm=T), NA)

## Define core ISO3C codes to use in analysis
core_iso3c <- c("AFG", "AGO", "ALB", "AND", "ARE", "ARG", "ARM", "ATG", "AUS", "AUT", "AZE", "BDI", "BEL", "BEN", "BFA", "BGD", "BGR", "BHR", "BHS", "BIH", "BLR", "BLZ", "BOL", "BRA", "BRB", "BRN", "BTN", "BWA", "CAF", "CAN", "CHE", "CHL", "CHN", "CIV", "CMR", "COD", "COG", "COK", "COL", "COM", "CPV", "CRI", "CUB", "CYP", "CZE", "DEU", "DJI", "DMA", "DNK", "DOM", "DZA", "ECU", "EGY", "ERI", "ESP", "EST", "ETH", "FIN", "FJI", "FRA", "FSM", "GAB", "GBR", "GEO", "GHA", "GIN", "GMB", "GNB", "GNQ", "GRC", "GRD", "GTM", "GUY", "HND", "HRV", "HTI", "HUN", "IDN", "IND", "IRL", "IRN", "IRQ", "ISL", "ISR", "ITA", "JAM", "JOR", "JPN", "KAZ", "KEN", "KGZ", "KHM", "KIR", "KNA", "KOR", "KWT", "LAO", "LBN", "LBR", "LBY", "LCA", "LIE", "LKA", "LSO", "LTU", "LUX", "LVA", "MAR", "MDA", "MDG", "MDV", "MEX", "MHL", "MKD", "MLI", "MLT", "MMR", "MNE", "MNG", "MOZ", "MRT", "MUS", "MWI", "MYS", "NAM", "NER", "NGA", "NIC", "NLD", "NOR", "NPL", "NRU", "NZL", "OMN", "PAK", "PAN", "PER", "PHL", "PLW", "PNG", "POL", "PRK", "PRT", "PRY", "QAT", "ROU", "RUS", "RWA", "SAU", "SDN", "SEN", "SGP", "SLB", "SLE", "SLV", "SOM", "SRB", "SSD", "STP", "SUR", "SVK", "SVN", "SWE", "SWZ", "SYC", "SYR", "TCD", "TGO", "THA", "TJK", "TKM", "TLS", "TON", "TTO", "TUN", "TUR", "TUV", "TZA", "UGA", "UKR", "URY", "USA", "UZB", "VCT", "VEN", "VNM", "VUT", "WSM", "YEM", "ZAF", "ZMB", "ZWE")
# core_iso3c <- c("AFG", "AGO", "ALB", "AND", "ARE", "ARG", "ARM", "ATG", "AUS", "AUT", "AZE", "BDI", "BEL", "BEN", "BFA", "BGD", "BGR", "BHR", "BHS", "BIH", "BLR", "BLZ", "BOL", "BRA", "BRB", "BRN", "BTN", "BWA", "CAF", "CAN", "CHE", "CHL", "CHN", "CIV", "CMR", "COD", "COG", "COK", "COL", "COM", "CPV", "CRI", "CUB", "CYP", "CZE", "DEU", "DJI", "DMA", "DNK", "DOM", "DZA", "ECU", "EGY", "ERI", "ESP", "EST", "ETH", "FIN", "FJI", "FRA", "FSM", "GAB", "GBR", "GEO", "GHA", "GIN", "GMB", "GNB", "GNQ", "GRC", "GRD", "GTM", "GUY", "HND", "HRV", "HTI", "HUN", "IDN", "IND", "IRL", "IRN", "IRQ", "ISL", "ISR", "ITA", "JAM", "JOR", "JPN", "KAZ", "KEN", "KGZ", "KHM", "KIR", "KNA", "KOR", "KWT", "LAO", "LBN", "LBR", "LBY", "LCA", "LIE", "LKA", "LSO", "LTU", "LUX", "LVA", "MAR", "MCO", "MDA", "MDG", "MDV", "MEX", "MHL", "MKD", "MLI", "MLT", "MMR", "MNE", "MNG", "MOZ", "MRT", "MUS", "MWI", "MYS", "NAM", "NER", "NGA", "NIC", "NLD", "NOR", "NPL", "NRU", "NZL", "OMN", "PAK", "PAN", "PER", "PHL", "PLW", "PNG", "POL", "PRK", "PRT", "PRY", "PSE", "QAT", "ROU", "RUS", "RWA", "SAU", "SDN", "SEN", "SGP", "SLB", "SLE", "SLV", "SMR", "SOM", "SRB", "SSD", "STP", "SUR", "SVK", "SVN", "SWE", "SWZ", "SYC", "SYR", "TCD", "TGO", "THA", "TJK", "TKM", "TLS", "TON", "TTO", "TUN", "TUR", "TUV", "TZA", "UGA", "UKR", "URY", "USA", "UZB", "VCT", "VEN", "VNM", "VUT", "WSM", "YEM", "ZAF", "ZMB", "ZWE")

#### 1. Load in country-level panel data ####

### Load in GHG data

## Load in PRIMAP GHG data
primap <- read_csv("input/primap_v231.csv") %>% 
  filter(entity == "KYOTOGHG (AR4GWP100)") %>% # Aggregate using Kyoto GWP rules
  filter(`scenario (PRIMAP-hist)` == "HISTTP") %>% # Prioritize third-party emissions data over country data
  filter(`category (IPCC2006_PRIMAP)` == "M.0.EL") %>%  # Include all sectors
  pivot_longer(names_to = "year", values_to = "ghg_primap", `1980`:`2019`) %>% 
  select(iso3c = `area (ISO3)`, year, ghg_primap) %>% 
  filter(year > 1989) %>% 
  filter(!iso3c %in% c("ANNEXI", "AOSIS", "BASIC", "EARTH",
                       "EU28", "EU27BX", "NONANNEXI", "UMBRELLA")) %>% 
  filter(iso3c %in% core_iso3c) %>% 
  mutate(year = as.numeric(year)) %>% 
  mutate(ghg_primap = ghg_primap/1000) # into MtCO2

## Get LULUCF status for countries' targets
lulucf_flag <- read_csv("input/cw_ndc_data_highlevel_20220928.csv") %>% 
  select(iso3c = ISO, ndc_date, coverage_sectors_label) %>% 
  filter(iso3c %in% core_iso3c) %>% 
  group_by(iso3c) %>% 
  mutate(count = row_number()) %>% 
  filter(count == max(count)) %>%
  select(-count, -ndc_date) %>% 
  summarize(iso3c, 
            lulucf_flag = str_detect(coverage_sectors_label, 
                                     "including LULUCF"))

## Load in WRI/CAIT/Climate Watch data
cw <- read_csv("input/cw_ghg_20220928.csv") %>% 
  filter(Sector %in% c("Total excluding LUCF", "Total including LUCF"),
         Gas == "All GHG") %>% 
  pivot_longer(names_to = "year", values_to = "cw_ghg_", `1990`:`2019`) %>% 
  pivot_wider(names_from = "Sector", values_from = "cw_ghg_", ) %>% 
  select(iso3c = Country, year, 
         ghg_cw_ex  = `Total excluding LUCF`, 
         ghg_cw_lucf = `Total including LUCF`) %>% 
  mutate(year = as.numeric(year)) %>% 
  filter(iso3c %in% core_iso3c) %>% 
  as.data.frame() %>% 
  full_join(., lulucf_flag, by = 'iso3c') %>% 
  mutate(ghg_cw_chosen = ifelse(lulucf_flag == T, ghg_cw_lucf, ghg_cw_ex))


### Load in 2015 NDC targets data

## Load in Paris targets data
ndc2015 <- read_csv("input/ndc_2015_reconciled.csv") %>% 
  filter(iso3c %in% core_iso3c) %>% 
  arrange(iso3c) %>% 
  # Make proper date variables
  mutate(date = as_date("2015-12-12")) %>% 
  relocate(date, .after = ("country")) %>% 
  # Remove 'MtCO2' from values
  mutate_at(vars(c(cw_ndc1_ghgtarget_scenario_referencelevel,
                   cw_ndc1_ghgtarget_fixedlevel_goal,
                   cw_ndc1_ghgtarget_fixedlevel_goal_stretch)), ~parse_number(.)) %>% 
  mutate(cw_ndc1_ghgtarget_type = 
           ifelse(cw_ndc1_ghgtarget_type == "Base year", 
                  "Base Year", 
                  ifelse(cw_ndc1_ghgtarget_type == "Fixed level", 
                         "Fixed Level", cw_ndc1_ghgtarget_type)))


## Merge GHG data and 2015 NDCs to get target levels
targets2015 <- full_join(primap, cw, by = c("iso3c", "year")) %>% 
  full_join(., ndc2015 , by = "iso3c") %>% 
  arrange(iso3c, year) %>% 
  # Get emissions in base year and GHG 2010 as own variables
  mutate(ndc1_baseyear_emissions = ifelse((cw_ndc1_ghgtarget_type == "Base Year" &
                                             year == cw_ndc1_ghgtarget_referenceyear),
                                          ghg_cw_chosen, NA),
         ghg_cw_chosen_2000 = ifelse(year == 2000, ghg_cw_chosen, NA),
         ghg_cw_chosen_2010 = ifelse(year == 2010, ghg_cw_chosen, NA),
         ghg_cw_chosen_2015 = ifelse(year == 2015, ghg_cw_chosen, NA)) %>% 
  group_by(iso3c) %>% 
  mutate(ndc1_baseyear_emissions = my.max(ndc1_baseyear_emissions),
         ghg_cw_chosen_2000 = my.max(ghg_cw_chosen_2000),
         ghg_cw_chosen_2010 = my.max(ghg_cw_chosen_2010),
         ghg_cw_chosen_2015 = my.max(ghg_cw_chosen_2015)) %>% 
  ungroup() %>% 
  # Get absolute emissions levels in targets
  mutate(ndc1_absolute_uncond = 
           ifelse(cw_ndc1_ghgtarget_type == "Fixed Level", cw_ndc1_ghgtarget_fixedlevel_goal, # For fixed level targets
                  ifelse(cw_ndc1_ghgtarget_type == "Scenario", # For scenario targets
                         cw_ndc1_ghgtarget_scenario_referencelevel*(1-cw_ndc1_ghgtarget_scenario_goal/100),
                         ifelse(cw_ndc1_ghgtarget_type == "Base Year", # For base year targets
                                ndc1_baseyear_emissions*(1-cw_ndc1_ghgtarget_baseyear_goal/100), NA)))) %>% 
  mutate(ndc1_absolute_cond = 
           ifelse(cw_ndc1_ghgtarget_type == "Fixed Level", cw_ndc1_ghgtarget_fixedlevel_conditional, # For fixed level targets
                  ifelse(cw_ndc1_ghgtarget_type == "Scenario", # For scenario targets
                         cw_ndc1_ghgtarget_scenario_referencelevel*(1-cw_ndc1_ghgtarget_scenario_goal_conditional/100),
                         ifelse(cw_ndc1_ghgtarget_type == "Base Year", # For base year targets
                                ndc1_baseyear_emissions*(1-cw_ndc1_ghgtarget_baseyear_goal_conditional/100), NA)))) %>% 
  # Get emissions levels in targets as percentage of 2010 emissions (same benchmark as PRIMAP)
  mutate(ndc1_percentage_uncond = ndc1_absolute_uncond/ghg_cw_chosen_2010,
         ndc1_percentage_cond = ndc1_absolute_cond/ghg_cw_chosen_2010) %>% 
  # Get an average across both NDC targets (unconditional and conditional)
  rowwise() %>% 
  mutate(ndc1_percentage_average = mean(c(
    ndc1_percentage_uncond, ndc1_percentage_cond), na.rm=T)) %>% 
  mutate(ndc1_absolute_average = mean(c(
    ndc1_absolute_uncond, ndc1_absolute_cond), na.rm=T)) %>% 
  # Re-scale the targets so that higher values are cuts, zero is constant
  mutate_at(vars(ndc1_percentage_uncond, ndc1_percentage_cond, ndc1_percentage_average),
            ~(1-.)*100) %>% 
  # Get GHG growth rates
  mutate(ghg_growth_to_2015 = (ghg_cw_chosen_2015/ghg_cw_chosen_2000)^(1/15),
         ghg_growth_to_ndc1 = (ndc1_absolute_average/ghg_cw_chosen_2015)^(1/15)) %>% 
  mutate(growth_pp_pre = (ghg_growth_to_2015 - 1)*100,
         growth_pp_post_ndc1 = (ghg_growth_to_ndc1 - 1)*100,
         ndc1_delta_growth = growth_pp_post_ndc1 - growth_pp_pre) %>% 
  # Drop repeated observations
  select(-year, -ghg_primap, -ghg_cw_ex, -ghg_cw_lucf, -ghg_cw_chosen) %>% 
  distinct() %>% 
  filter(iso3c %in% core_iso3c) %>% 
  # Keep only the core variables
  select(iso3c, country, ghg_cw_chosen_2010, ghg_cw_chosen_2015,
         ndc1_absolute_average, ndc1_absolute_uncond, ndc1_absolute_cond,
           ndc1_percentage_uncond, ndc1_percentage_cond, ndc1_percentage_average,
         ndc1_delta_growth, growth_pp_post_ndc1, growth_pp_pre) %>%
  mutate_at(vars(ghg_cw_chosen_2010:growth_pp_pre), ~replace(., is.na(.), NA)) %>% 
  as.data.frame()


### Load in 2021 NDC targets data

## Load in SR 2021 NDC data
ndc2021 <- read_excel("input/ndc_sr_2021.xlsx", sheet = "2021 Dataset") %>% 
  arrange(iso3c) %>% 
  # Make proper date variables
  mutate(date = lubridate::as_date(submission_date_yy_mm_dd)) %>% 
  relocate(date, .after = ("country")) %>% 
  select(-submission_date_yy_mm_dd) %>% 
  # Remove 'MtCO2' from values
  mutate_at(vars(c(cw_ndc2_ghgtarget_scenario_referencelevel,
                   cw_ndc2_ghgtarget_fixedlevel_goal,
                   cw_ndc2_ghgtarget_fixedlevel_goal_stretch,
                   cw_ndc2_ghgtarget_fixedlevel_conditional)), ~parse_number(.))

# Merge 2021 targets with GHG time series
targets2021 <- full_join(primap, cw, by = c("iso3c", "year")) %>%
  full_join(., ndc2021 , by = "iso3c", relationship = "many-to-many") %>% 
  arrange(iso3c, year) %>% 
  # Get emissions in base year and GHG 2010 as own variables
  mutate(ndc2_baseyear_emissions = ifelse((cw_ndc2_ghgtarget_type == "Base Year" &
                                             year == cw_ndc2_ghgtarget_referenceyear),
                                          ghg_cw_chosen, NA),
         ghg_cw_chosen_2000 = ifelse(year == 2000, ghg_cw_chosen, NA),
         ghg_cw_chosen_2010 = ifelse(year == 2010, ghg_cw_chosen, NA),
         ghg_cw_chosen_2015 = ifelse(year == 2015, ghg_cw_chosen, NA)) %>% 
  group_by(iso3c) %>% 
  mutate(ndc2_baseyear_emissions = my.max(ndc2_baseyear_emissions),
         ghg_cw_chosen_2000 = my.max(ghg_cw_chosen_2000),
         ghg_cw_chosen_2010 = my.max(ghg_cw_chosen_2010),
         ghg_cw_chosen_2015 = my.max(ghg_cw_chosen_2015)) %>% 
  ungroup() %>% 
  # Get absolute emissions levels in targets
  ungroup() %>% 
  mutate(ndc2_absolute_uncond = 
           ifelse(cw_ndc2_ghgtarget_type == "Fixed Level", cw_ndc2_ghgtarget_fixedlevel_goal, # For fixed level targets
                  ifelse(cw_ndc2_ghgtarget_type == "Scenario", # For scenario
                         cw_ndc2_ghgtarget_scenario_referencelevel*(1-cw_ndc2_ghgtarget_scenario_unconditional_goal/100),
                         ifelse(cw_ndc2_ghgtarget_type == "Base Year", # For base year
                                ndc2_baseyear_emissions*(1-cw_ndc2_ghgtarget_baseyear_goal/100), NA)))) %>% 
  mutate(ndc2_absolute_cond = 
           ifelse(cw_ndc2_ghgtarget_type == "Fixed Level", cw_ndc2_ghgtarget_fixedlevel_conditional,
                  ifelse(cw_ndc2_ghgtarget_type == "Scenario",
                         cw_ndc2_ghgtarget_scenario_referencelevel*(1-cw_ndc2_ghgtarget_scenario_goal_conditional/100),
                         ifelse(cw_ndc2_ghgtarget_type == "Base Year",
                                ndc2_baseyear_emissions*(1-cw_ndc2_ghgtarget_baseyear_goal_conditional/100), NA)))) %>% 
  # Get emissions levels in targets as percentage of 2010 emissions (same benchmark as PRIMAP)
  mutate(ndc2_percentage_uncond = ndc2_absolute_uncond/ghg_cw_chosen_2010,
       ndc2_percentage_cond = ndc2_absolute_cond/ghg_cw_chosen_2010) %>% 
  # Get an average across both NDC targets (unconditional and conditional)
  rowwise() %>% 
  mutate(ndc2_percentage_average = mean(c(
      ndc2_percentage_uncond, ndc2_percentage_cond), na.rm=T)) %>% 
  mutate(ndc2_absolute_average = mean(c(
    ndc2_absolute_uncond, ndc2_absolute_cond), na.rm=T)) %>% 
  # Re-scale the targets so that higher values are cuts, zero is constant
  mutate_at(vars(ndc2_percentage_uncond, ndc2_percentage_cond, ndc2_percentage_average),
            ~(1-.)*100) %>% 
  ungroup() %>% 
  # Get GHG growth rates
  mutate(ghg_growth_to_2015 = (ghg_cw_chosen_2015/ghg_cw_chosen_2000)^(1/15),
         ghg_growth_to_ndc2 = (ndc2_absolute_average/ghg_cw_chosen_2015)^(1/15)) %>% 
  mutate(growth_pp_pre = (ghg_growth_to_2015 - 1)*100,
         growth_pp_post_ndc2 = (ghg_growth_to_ndc2 - 1)*100,
         ndc2_delta_growth = growth_pp_post_ndc2 - growth_pp_pre) %>% 
  # Drop repeated observations
  select(-year, -ghg_primap, -ghg_cw_ex, -ghg_cw_lucf, -ghg_cw_chosen) %>% 
  distinct() %>% 
  # Keep only "final" NDC 
  group_by(iso3c) %>% 
  mutate(entry = row_number(),
         entry_max = my.max(entry)) %>% 
  ungroup() %>% 
  filter(entry==entry_max) %>% 
  filter(iso3c %in% core_iso3c) %>%
  # Keep only the core variables
  select(iso3c, country, ndc2_date = date, ghg_cw_chosen_2010, 
         ndc2_absolute_average, ndc2_absolute_uncond, ndc2_absolute_cond,
         ndc2_percentage_uncond, ndc2_percentage_cond, ndc2_percentage_average,
         ndc2_delta_growth, growth_pp_post_ndc2) %>%
  mutate_at(vars(ghg_cw_chosen_2010:growth_pp_post_ndc2), ~replace(., is.na(.), NA)) %>% 
  as.data.frame()

### Load in climate laws data

climate_laws <- read_csv("input/grantham_climate_laws_panel.csv") %>% 
  arrange(iso3c, year) %>% 
  filter(year %in% c(1990:2019)) %>% 
  mutate(climate_laws2010 = ifelse(year == 2010, climate_laws, NA),
         climate_laws2014 = ifelse(year == 2014, climate_laws, NA),
         climate_laws2019 = ifelse(year == 2019, climate_laws, NA)) %>% 
  group_by(iso3c) %>% 
  mutate_at(vars(climate_laws2010:climate_laws2019), ~my.max(.)) %>% 
  ungroup() %>% 
  mutate(climate_laws_new20142019 = climate_laws2019 - climate_laws2014) %>% 
  select(iso3c, year, climate_laws, climate_laws_new20142019)
  
### Load in World Development Indicators data

## NOT RUN --- Saved to disk, and loaded from there
# library(WDI)
# indicators <- c('NY.GDP.MKTP.KD', # NY.GDP.MKTP.KD gdp_mkt : constant 2010 usd
#                 'NY.GDP.PCAP.KD', # NY.GDP.PCAP.KD gdppc_mkt : gdp per capita, constant 2010 usd
#                 'NE.EXP.GNFS.ZS', # Exports as share of GDP
#                 'NE.IMP.GNFS.ZS', # Imports as share of GDP
#                 'NV.IND.TOTL.ZS', # Industry as share of GDP
#                 'NY.GDP.COAL.RT.ZS', # Coal rents as share of GDP
#                 'NY.GDP.NGAS.RT.ZS', # Natural gas rents as share of GDP
#                 'NY.GDP.PETR.RT.ZS', # Oil rents as share of GDP
#                 'EG.ELC.RNWX.ZS', # Renewables as a share of electricity
#                 'EG.ELC.ACCS.ZS', # Access to electricity as a share of population
#                 'SP.POP.TOTL') # SP.POP.TOTL : population, total
# 
# # pull indicators from the WDI online
# wdi_raw <- WDI(indicator = indicators, start = 1990, end = 2019, extra = TRUE) 
# 
# wdi <- wdi_raw %>%
#   filter(income!="Aggregates") %>%
#   select(iso3c, country, year,
#          gdp = 'NY.GDP.MKTP.KD',
#          gdp_capita = 'NY.GDP.PCAP.KD',
#          exports = 'NE.EXP.GNFS.ZS',
#          imports = 'NE.IMP.GNFS.ZS',
#          industry = 'NV.IND.TOTL.ZS',
#          coal_rents = 'NY.GDP.COAL.RT.ZS',
#          gas_rents = 'NY.GDP.NGAS.RT.ZS',
#          oil_rents = 'NY.GDP.PETR.RT.ZS',
#          renewables_share = 'EG.ELC.RNWX.ZS',
#          electricity_access = 'EG.ELC.ACCS.ZS',
#          population = 'SP.POP.TOTL',
#          region,
#          income) %>%
#   arrange(iso3c, year) %>%
#   mutate_at(vars(year:population), ~as.numeric(.)) %>%
#   mutate(fossil_rents = coal_rents + gas_rents + oil_rents) %>%
#   select(-coal_rents, -oil_rents, -gas_rents) %>%
#   relocate(fossil_rents, .after = industry) %>%
#   mutate_at(vars(gdp, gdp_capita, exports, imports, fossil_rents, population),
#             ~log(1 + . )) %>%
#   mutate(income = replace(income, iso3c == "VEN", "Upper middle income")) %>%
#   filter(iso3c %in% core_iso3c)
# write_csv(wdi, file = "input/wdi_20230920.csv")

wdi <- read_csv("input/wdi_20230920.csv")
  

### Load in IOs data
# Code to make this is in 99_io_matrix.R

io_membership_cs <- read_csv("input/io_membership_cs.csv") 


### Load in V-Dem data

vdem <- read_csv("input/vdem.csv") %>% 
  filter(year >= 1990) %>% 
  select(iso3c = country_text_id,
         year,
         vdem = vdem_liberal_democracy_index) %>% 
  filter(iso3c %in% core_iso3c) %>%
  arrange(iso3c, year)


### Merge all country-level data together

country_level <- full_join(targets2015, 
                           select(targets2021, -country, -ghg_cw_chosen_2010), 
                           by = c('iso3c')) %>% 
  left_join(., cw, by = c("iso3c")) %>% 
  left_join(., primap, by = c("iso3c", "year")) %>% 
  left_join(., climate_laws, by = c("iso3c", "year")) %>% 
  left_join(., select(wdi, -country), by = c("iso3c", "year")) %>% 
  left_join(., io_membership_cs, by = "iso3c") %>% 
  left_join(., vdem, by = c("iso3c", "year")) %>% 
  mutate(safe_ndc1_percentage_average = ndc1_percentage_average,
         safe_ndc1_delta_growth = ndc1_delta_growth,
         safe_ndc2_percentage_average = ndc2_percentage_average,
         safe_ndc2_delta_growth = ndc2_delta_growth) %>% 
  select(iso3c, country, year, starts_with("safe"), everything()) %>% 
  # Standardize some variables by income groups
  group_by(income) %>% 
  mutate(z_ndc1_percentage_average = scale(ndc1_percentage_average),
         z_ndc1_delta_growth = scale(ndc1_delta_growth),
         z_climate_laws = scale(climate_laws),
         z_climate_laws_new20142019 = scale(climate_laws_new20142019)) %>% 
  ungroup() %>% 
  mutate(z_ndc1_percentage_average = as.numeric(z_ndc1_percentage_average),
         z_ndc1_delta_growth = as.numeric(z_ndc1_delta_growth),
         z_climate_laws = as.numeric(z_climate_laws),
         z_climate_laws_new20142019 = as.numeric(z_climate_laws_new20142019)) %>% 
  # filter(year>=2010) %>% 
  as.data.frame()

#### 2. Multiple imputation for the targets and covariates ####

## Set bounds for the Paris and Glasgow targets
column_location_ndc1_percentage_average <- which(colnames(country_level)=="ndc1_percentage_average")
column_location_ndc2_percentage_average <- which(colnames(country_level)=="ndc2_percentage_average")

bds_country <- matrix(rbind(c(column_location_ndc1_percentage_average, -300, 100),
                    c(column_location_ndc2_percentage_average, -300, 100)),
                    nrow = 2, ncol = 3)

## Impute
start_time <- Sys.time()
country_level_impute <- country_level %>% 
  amelia(x = .,
         m = 15,
         p2s = 1,
         cs = "iso3c", 
         ts = "year",
         # polytime = 2, 
         # intercs = F,
         idvars = c("country", 
                    "safe_ndc1_percentage_average",
                    "safe_ndc1_delta_growth",
                    "safe_ndc2_percentage_average",
                    "safe_ndc2_delta_growth",
                    "ndc2_date"),  # Don't impute these
         noms = c("region", "income", "lulucf_flag"),
         bounds = bds_country,
         max.resample = 100,
         parallel = "multicore", ncpus = 6, # Parallel messes with the seed/reproducibility (8 cores on iMac)
         empri = 0.01*nrow(country_level), # Helps with highly correlated variables
         incheck = T)
finish_time <- Sys.time() # About 30 sec per imputation; about 100 sec for the whole thing
finish_time - start_time
#' Message: 
#' The variables (or variable with levels) ndc1_absolute_cond, ndc1_percentage_average, growth_pp_pre, ndc2_absolute_cond, ndc2_percentage_average, growth_pp_post_ndc2, z_climate_laws_new20142019 are perfectly collinear with another variable in the data.

# saveRDS(country_level_impute, "output/country_level_impute_20230920.rds")
# country_level_impute <- readRDS("output/country_level_impute_20230920.rds")


### Then average these to merge onto the dyadic data

names(country_level_impute$imputations$imp1)
character_cols <- c("year", "iso3c", "country", "region", "income", "ndc2_date", 
                    "safe_ndc1_percentage_average", "safe_ndc1_delta_growth",
                    "safe_ndc2_percentage_average", "safe_ndc2_delta_growth")
numeric_cols <- c("ghg_cw_chosen_2010", "ndc1_absolute_average", 
                  "ndc1_absolute_uncond", "ndc1_absolute_cond", 
                  "ndc1_percentage_uncond", "ndc1_percentage_cond", 
                  "ndc1_percentage_average", "ndc1_delta_growth",
                  "growth_pp_post_ndc1", "growth_pp_pre", 
                  "ndc2_absolute_average", "ndc2_absolute_uncond", 
                  "ndc2_absolute_cond", "ndc2_percentage_uncond", 
                  "ndc2_percentage_cond", "ndc2_percentage_average",
                  "ndc2_delta_growth", "growth_pp_post_ndc2", "ghg_cw_ex", 
                  "ghg_cw_lucf", "lulucf_flag", "ghg_cw_chosen", 
                  "ghg_primap", "climate_laws", "climate_laws_new20142019",
                  "gdp", "gdp_capita", "exports", "imports", "industry", 
                  "fossil_rents", "renewables_share", "electricity_access",
                  "population", "state_io_memberships", "vdem", "z_ndc1_percentage_average", 
                  "z_ndc1_delta_growth", "z_climate_laws", "z_climate_laws_new20142019")

character_pos <- which(names(country_level_impute$imputations$imp1) %in% character_cols)
numeric_pos <- which(names(country_level_impute$imputations$imp1) %in% numeric_cols)

## Time-invariant country level variables to average
time_invariant_country_to_average <- c(
  "ghg_cw_chosen_2010", "ndc1_absolute_average", 
  "ndc1_absolute_uncond", "ndc1_absolute_cond", 
  "ndc1_percentage_uncond", "ndc1_percentage_cond", 
  "ndc1_percentage_average", "ndc1_delta_growth",
  "growth_pp_post_ndc1", "growth_pp_pre", 
  "ndc2_absolute_average", "ndc2_absolute_uncond", 
  "ndc2_absolute_cond", "ndc2_percentage_uncond", 
  "ndc2_percentage_cond", "ndc2_percentage_average",
  "ndc2_delta_growth", "growth_pp_post_ndc2", 
  "lulucf_flag", "climate_laws_new20142019",
  "state_io_memberships", 
  "z_ndc1_percentage_average", "z_ndc1_delta_growth",
  "z_climate_laws", "z_climate_laws_new20142019")

## Not these: 
# c("ghg_cw_ex", "ghg_cw_lucf", "ghg_cw_chosen", 
#   "ghg_primap", "climate_laws", "gdp", "gdp_capita", 
#   "exports", "imports", "industry", "fossil_rents", 
#   "renewables_share", "electricity_access", 
#   "population", "vdem")

## Average the numeric variables
country_level_average <- cbind.data.frame(country_level_impute$imputations$imp1[,character_pos],
                                            cbind(country_level_impute$imputations$imp1[,numeric_pos] + 
                                                    country_level_impute$imputations$imp2[,numeric_pos] +
                                                    country_level_impute$imputations$imp3[,numeric_pos] +
                                                    country_level_impute$imputations$imp4[,numeric_pos] +
                                                    country_level_impute$imputations$imp5[,numeric_pos] +
                                                    country_level_impute$imputations$imp6[,numeric_pos] +
                                                    country_level_impute$imputations$imp7[,numeric_pos] +
                                                    country_level_impute$imputations$imp8[,numeric_pos] +
                                                    country_level_impute$imputations$imp9[,numeric_pos] +
                                                    country_level_impute$imputations$imp10[,numeric_pos] +
                                                    country_level_impute$imputations$imp11[,numeric_pos] +
                                                    country_level_impute$imputations$imp12[,numeric_pos] +
                                                    country_level_impute$imputations$imp13[,numeric_pos] +
                                                    country_level_impute$imputations$imp14[,numeric_pos] +
                                                    country_level_impute$imputations$imp15[,numeric_pos])/15) %>% 
  # Average imputed time-invariant country-level variables
  group_by(iso3c) %>%
  mutate_at(vars(all_of(time_invariant_country_to_average)),
            ~mean(.)) %>%
  ungroup() %>% 
  as.data.frame()
  
# saveRDS(country_level_average, "output/country_level_average_20230920.rds")
# country_level_average <- readRDS("output/country_level_average_20230920.rds")


#### 3. Load in dyadic trade and IO data ####

### Load in the COMTRADE dyadic data for the CBAM sectors
comtrade_buckets <- fread("input/comtrade_total_eite_green_years.csv", 
                          header = T, 
                          select = c(1:6)) %>% 
  filter(trade_flow == "Export") %>% 
  pivot_wider(names_from = bucket, 
              values_from = trade_value_sum, 
              names_prefix = "trade_") %>% 
  mutate_at(vars(trade_EITE:trade_Green), ~ifelse(is.na(.), 0, as.numeric(.))) %>% 
  select(main_iso = reporter_iso, 
         counterpart_iso = partner_iso, 
         year, 
         trade_total = trade_Total, 
         trade_eite = trade_EITE, 
         trade_green = trade_Green) %>% 
  as.data.frame()

### -- Missing and non-missing trade data

# Create an empty dyadic panel, square, to then merge the export data onto
square_exports_panel <- cbind.data.frame(
  main_iso = rep(core_iso3c,
                 each = length(core_iso3c)*5),
  counterpart_iso = rep(core_iso3c, 
                        times = length(core_iso3c)*5),
  year = rep(rep(seq(2015, 2019, by = 1),
                 each = length(core_iso3c)), times = length(core_iso3c))) %>% 
  # Merge the CBAM data
  left_join(., comtrade_buckets, 
            by = c("main_iso", "counterpart_iso", "year")) %>% 
  # Delete duplicate dyads
  filter(main_iso %in% core_iso3c,
         counterpart_iso %in% core_iso3c,
         main_iso != counterpart_iso) %>% 
  # Flag and count missing data
  rowwise() %>% 
  mutate(is_missing = ifelse((is.na(trade_total) | is.na(trade_eite) | is.na(trade_green)), 1, 0)) %>% 
  group_by(main_iso, year) %>% 
  mutate(sum_missing = sum(is_missing)) %>% 
  ungroup() %>% 
  # Replace missing data with 0 for countries that report, but leave NA for countries that don't report
  rowwise() %>% 
  mutate_at(vars(trade_total, trade_eite, trade_green),
            ~ifelse(is.na(.) & sum_missing != 191, 0, .)) %>% 
  select(-is_missing, -sum_missing) %>% 
  as.data.frame()
# square_exports_panel


### Load in the CEPII dyadic data

## `_o` --> "origin", the exporter
## `_d` --> "destination", the importer
## Dyads appear twice --> once as exporter, once as importer

cepii_gravity <- readRDS("input/Gravity_V202211.rds") %>% 
  filter(year>=2010, year<2020,
         country_id_d != country_id_o,
         country_exists_d == 1,
         country_exists_o == 1,
         iso3_o %in% core_iso3c,
         iso3_d %in% core_iso3c) %>% 
  select(year, iso3_o, iso3_d, 
         tradeflow_baci, 
         contig, distw_harmonic, diplo_disagreement, comlang_ethno,
         comcol, col45, fta_wto) %>% 
  as.data.frame()

### Load in dyadic IO data
# Code to make this is in 99_io_matrix.R
io_dyadic <- read_csv("input/io_membership_dyadic.csv")


## Merge CEPII data onto the covariate/WDI data, twice --- once for each ISO
data_for_imputation <- full_join(cepii_gravity, square_exports_panel,
                                 by = c("iso3_o" = "main_iso",
                                        "iso3_d" = "counterpart_iso",
                                        "year" = "year")) %>%
  left_join(., io_dyadic, by = c("iso3_o" = "iso3_a",
                                  "iso3_d" = "iso3_b")) %>%
  relocate(c(trade_total, trade_eite, trade_green), .after = "tradeflow_baci") %>%
  full_join(., country_level_average, by = c("iso3_o" = "iso3c", "year")) %>%
  full_join(., country_level_average, by = c("iso3_d" = "iso3c", "year"),
            suffix = c("_o", "_d")) %>%
  filter(iso3_o!=1,
         iso3_d!=1) %>%
  filter(iso3_o %in% core_iso3c,
         iso3_d %in% core_iso3c) %>%
  as.data.frame()


# saveRDS(data_for_imputation, "output/data_for_imputation20230920.rds")
# data_for_imputation <- readRDS("output/data_for_imputation20230920.rds")

## Clear objects from workspace
# rm(cepii_gravity, climate_laws, comtrade_buckets, cw, 
#    io_dyadic, io_membership_cs, lulucf_flag, ndc2015, 
#    ndc2021, primap, square_exports_panel, targets2015, 
#    targets2021, vdem, wdi)


#### 4. Multiple imputation of the dyadic data #### 

#' This is the 'double'/second imputation
#' First doing the 'panel'/cross-section (above)
#' Then merging this to the dyadic data 
#' And imputing again (here)

## Set bounds for the `tradeflow_baci` variable
column_location_baci <- which(colnames(data_for_imputation)=="tradeflow_baci")
column_location_total <- which(colnames(data_for_imputation)=="trade_total")
column_location_eite <- which(colnames(data_for_imputation)=="trade_eite")
column_location_green <- which(colnames(data_for_imputation)=="trade_green")

bds <- matrix(rbind(c(column_location_baci, 0, 5e8),
                    c(column_location_total, 0, 4.8e11),
                    c(column_location_eite, 0, 3.4e10),
                    c(column_location_green, 0, 4.4e9)),
              nrow = 4, ncol = 3)

## Impute 15 datasets
start_time <- Sys.time()
full_impute <- data_for_imputation %>% 
  mutate(dyad_id = paste(iso3_o, iso3_d, sep = "_")) %>% 
  amelia(x = .,
         m = 15,
         p2s = 1,
         cs = "dyad_id", 
         ts = "year",
         # polytime = 2,
         # intercs = F,
         idvars = c("iso3_o", "iso3_d", "country_o", "country_d",
                    "safe_ndc1_percentage_average_o",
                    "safe_ndc1_delta_growth_o",
                    "safe_ndc2_percentage_average_o",
                    "safe_ndc2_delta_growth_o",
                    "ndc2_date_o", "ndc2_date_d",
                    "safe_ndc1_percentage_average_d",
                    "safe_ndc1_delta_growth_d",
                    "safe_ndc2_percentage_average_d",
                    "safe_ndc2_delta_growth_d"),  # Don't impute these
         noms = c("contig", "comlang_ethno", "comcol", "col45", "fta_wto",
                  "region_o", "region_d", "income_o", "income_d",
                  "lulucf_flag_o", "lulucf_flag_d"),
         logs = c("tradeflow_baci", "trade_total", "trade_eite", "trade_green"),
         bounds = bds, # lower bound on tradeflow_baci
         max.resample = 100,
         parallel = "multicore", ncpus = 4, # Parallel messes with the seed/reproducibility (8 cores on iMac)
         empri = 0.01*nrow(data_for_imputation), # Helps with highly correlated variables
         incheck = T)
finish_time <- Sys.time() 
finish_time - start_time # 7' for one, 55 iterations; 41' on 4 cores

# saveRDS(full_impute, "output/full_imputed_data_20230920.rds")


### Average these datasets

## Group variables that are nominal versus numeric

# names(full_impute$imputations$imp1)
character_cols <- c("year", "iso3_o", "iso3_d", "dyad_id",
                    # _o
                    "country_o", "region_o", "income_o", 
                    "safe_ndc1_percentage_average_o",
                    "safe_ndc1_delta_growth_o", 
                    "safe_ndc2_percentage_average_o",
                    "safe_ndc2_delta_growth_o",
                    "ndc2_date_o",
                    "lulucf_flag_o",
                    # _d
                    "country_d", "region_d", "income_d", 
                    "safe_ndc1_percentage_average_d",
                    "safe_ndc1_delta_growth_d", 
                    "safe_ndc2_percentage_average_d",
                    "safe_ndc2_delta_growth_d",
                    "ndc2_date_d",
                    "lulucf_flag_d")
numeric_cols <- c("tradeflow_baci", "trade_total", "trade_eite", "trade_green",
                  "sum_common_io",
                  # _o
                  "ndc1_percentage_uncond_o", 
                  "ndc1_percentage_cond_o",
                  "ndc1_percentage_average_o",
                  "ndc1_delta_growth_o", 
                  "ndc1_absolute_average_o",
                  "ndc1_absolute_uncond_o",
                  "ndc1_absolute_cond_o",
                  "growth_pp_post_ndc1_o",
                  "growth_pp_pre_o",
                  "ndc2_absolute_average_o",
                  "ndc2_absolute_uncond_o",
                  "ndc2_absolute_cond_o",
                  "ndc2_percentage_uncond_o",
                  "ndc2_percentage_cond_o",
                  "ndc2_percentage_average_o",
                  "ndc2_delta_growth_o",
                  "growth_pp_post_ndc2_o",
                  "ghg_cw_chosen_o",
                  "ghg_primap_o",
                  "climate_laws_new20142019_o",
                  "vdem_o",
                  "z_ndc1_percentage_average_o",
                  "z_ndc1_delta_growth_o",
                  "z_climate_laws_o",
                  "z_climate_laws_new20142019_o",
                  "gdp_o",
                  "gdp_capita_o", 
                  "exports_o", 
                  "imports_o", 
                  "industry_o",
                  "fossil_rents_o", 
                  "renewables_share_o",
                  "electricity_access_o",
                  "population_o",
                  "ghg_cw_chosen_2010_o",
                  "climate_laws_o",
                  "ghg_cw_ex_o",
                  "ghg_cw_lucf_o",
                  "state_io_memberships_o",
                  # _d
                  "ndc1_percentage_uncond_d", 
                  "ndc1_percentage_cond_d",
                  "ndc1_percentage_average_d",
                  "ndc1_delta_growth_d", 
                  "ndc1_absolute_average_d",
                  "ndc1_absolute_uncond_d",
                  "ndc1_absolute_cond_d",
                  "growth_pp_post_ndc1_d",
                  "growth_pp_pre_d",
                  "ndc2_absolute_average_d",
                  "ndc2_absolute_uncond_d",
                  "ndc2_absolute_cond_d",
                  "ndc2_percentage_uncond_d",
                  "ndc2_percentage_cond_d",
                  "ndc2_percentage_average_d",
                  "ndc2_delta_growth_d",
                  "growth_pp_post_ndc2_d",
                  "ghg_cw_chosen_d",
                  "ghg_primap_d",
                  "climate_laws_new20142019_d",
                  "vdem_d",
                  "z_ndc1_percentage_average_d",
                  "z_ndc1_delta_growth_d",
                  "z_climate_laws_d",
                  "z_climate_laws_new20142019_d",
                  "gdp_d",
                  "gdp_capita_d", 
                  "exports_d", 
                  "imports_d", 
                  "industry_d",
                  "fossil_rents_d", 
                  "renewables_share_d",
                  "electricity_access_d",
                  "population_d",
                  "ghg_cw_chosen_2010_d",
                  "climate_laws_d",
                  "ghg_cw_ex_d",
                  "ghg_cw_lucf_d",
                  "state_io_memberships_d")
drop_cols <- c("contig", "distw_harmonic", "diplo_disagreement",
               "comlang_ethno", "comcol", "col45", "fta_wto")

character_pos <- which(names(full_impute$imputations$imp1) %in% character_cols)
drop_pos <- which(names(full_impute$imputations$imp1) %in% drop_cols)
numeric_pos <- which(names(full_impute$imputations$imp1) %in% numeric_cols)

full_imputed_averaged <- cbind.data.frame(full_impute$imputations$imp1[,character_pos],
                                            cbind(full_impute$imputations$imp1[,numeric_pos] + 
                                                    full_impute$imputations$imp2[,numeric_pos] +
                                                    full_impute$imputations$imp3[,numeric_pos] +
                                                    full_impute$imputations$imp4[,numeric_pos] +
                                                    full_impute$imputations$imp5[,numeric_pos] +
                                                    full_impute$imputations$imp6[,numeric_pos] +
                                                    full_impute$imputations$imp7[,numeric_pos] +
                                                    full_impute$imputations$imp8[,numeric_pos] +
                                                    full_impute$imputations$imp9[,numeric_pos] +
                                                    full_impute$imputations$imp10[,numeric_pos] +
                                                    full_impute$imputations$imp11[,numeric_pos] +
                                                    full_impute$imputations$imp12[,numeric_pos] +
                                                    full_impute$imputations$imp13[,numeric_pos] +
                                                    full_impute$imputations$imp14[,numeric_pos] +
                                                    full_impute$imputations$imp15[,numeric_pos])/15)

# saveRDS(full_imputed_averaged, "output/full_imputed_averaged_20230920.rds")
# full_imputed_averaged <- readRDS("output/full_imputed_averaged_20230920.rds")


#### 5. Make spatial weights ####
#' First create the right data structures: 
#' -- dyadic trade and cross-sectional targets
#' Then create the spatial weights for each matrix and type of target

##' Trade data are directed 
##' From `_o` origin
##' To `_d` destination
##' --> Reshape to get dyad level trade as a share of total trade

## Calculate each country's trade in dyad as a share of their total trade
# Within each of these sectors

hold_dyadic <- full_imputed_averaged %>% 
  select(iso3_o, iso3_d, year, 
         tradeflow_baci, trade_eite, trade_green) %>% 
  filter(iso3_o!=1,
         iso3_d!=1) %>% 
  # Summarize dyadic flows in the five year period
  filter(year>2015, year < 2020) %>% 
  # filter(iso3_o %in% c("CAN", "DNK", "GBR") & 
  #          iso3_d %in% c("CAN", "DNK", "GBR")) %>% 
  group_by(iso3_o, iso3_d) %>%
  summarize(tradeflow_baci = mean(tradeflow_baci), # <- these are exports from _o to _d
            trade_eite = mean(trade_eite),
            trade_green = mean(trade_green)) %>% 
  # For each dyad, get their total trade
  rowwise() %>% 
  mutate(dyad_id = paste(min(iso3_d, iso3_o),
                         max(iso3_d, iso3_o),
                         sep = "_")) %>% 
  group_by(dyad_id) %>% 
  mutate(dyadic_tradeflow = sum(tradeflow_baci), # Sum of exports/imports (undirected) in that dyad
         dyadic_trade_eite = sum(trade_eite),
         dyadic_trade_green = sum(trade_green)) %>% 
  group_by(iso3_o) %>% 
  mutate(exports_o_tradeflow = sum(tradeflow_baci), # Sum of _o's exports
         exports_o_eite = sum(trade_eite),
         exports_o_green = sum(trade_green)) %>% 
  group_by(iso3_d) %>% 
  mutate(imports_d_tradeflow = sum(tradeflow_baci), # Sum of _d's imports; this is at the wrong level, re-shape below
         imports_d_eite = sum(trade_eite),
         imports_d_green = sum(trade_green)) %>% 
  arrange(dyad_id) %>% 
  ungroup() %>% 
  as.data.frame()

imports_iso3_o <- hold_dyadic %>% 
  group_by(iso3_d) %>% # Yes, should be _d; this is for re-shaping the above
  # Get country-level total imports in each category
  distinct(imports_d_tradeflow, 
           imports_d_eite,
           imports_d_green) %>% # Yes, should be _d
  rename(iso3_o = iso3_d, # Then, rename and merge back in
         imports_o_tradeflow = imports_d_tradeflow,
         imports_o_eite = imports_d_eite,
         imports_o_green = imports_d_green) %>% 
  ungroup() %>% 
  as.data.frame()

# Merge back together
trade_long <- full_join(hold_dyadic %>% 
                          select(-imports_d_tradeflow, -imports_d_eite, -imports_d_green), 
                        imports_iso3_o, by = "iso3_o") %>% 
  # Get each country's total trade
  mutate(total_trade_o_tradeflow = exports_o_tradeflow + imports_o_tradeflow,
         total_trade_o_eite = exports_o_eite + imports_o_eite,
         total_trade_o_green = exports_o_green + imports_o_green) %>% 
  # Get each country's trade in dyad as share of their total trade
  mutate(share_dyadic_o_tradeflow = dyadic_tradeflow/total_trade_o_tradeflow,
         share_dyadic_o_eite = dyadic_trade_eite/total_trade_o_eite,
         share_dyadic_o_green = dyadic_trade_green/total_trade_o_green) %>% 
  select(iso3_o, iso3_d, 
         # starts_with("dyadic"), 
         starts_with("share")) 



### Now re-organize the targets data to merge for spatial weights

#' -- Glossary
#' variables with `safe_` prefix are not imputed 
#' variables that are not renamed are Amelia imputed in the country_level impute

partners_targets_cs <- full_imputed_averaged %>% 
  select(iso3_o, iso3_d, year, income_d,
         # should all be _d's targets --- 6? + safe_
         # -- Percentage
         ndc1_percentage_average_d,
         safe_ndc1_percentage_average_d,
         # -- Laws
         climate_laws_d,
         climate_laws_new20142019_d, 
         ## -- standardized, z-
         z_ndc1_percentage_average_d,
         z_climate_laws_d,
         z_climate_laws_new20142019_d) %>% 
  filter(iso3_o!=1,
         iso3_d!=1) %>% 
  filter(year %in% c(2015:2019)) %>% 
  group_by(iso3_d) %>%
  mutate(climate_laws_d = mean(climate_laws_d)) %>% 
  group_by(income_d) %>% 
  mutate(z_ndc1_percentage_average_safe_d = scale(safe_ndc1_percentage_average_d)) %>%  
  ungroup() %>% 
  mutate(z_ndc1_percentage_average_safe_d = as.numeric(z_ndc1_percentage_average_safe_d)) %>% 
  select(-year, -income_d) %>% 
  distinct() %>% 
  select(-iso3_o) %>% 
  group_by(iso3_d) %>%
  summarise_at(vars(ndc1_percentage_average_d:z_ndc1_percentage_average_safe_d),
               ~mean(.))
  

### Spatial weights 

##' Merge the `trade_long` object that has the dyadic trade shares
##' To the `partners_targets_cs` object that has the targets data

## For all trade
spatial_weights_tradeflow <- left_join(trade_long, partners_targets_cs, by = "iso3_d") %>% 
  # Spatial weight in two steps
  rowwise() %>% 
  mutate(
    # -- Percentage targets
    multiply_percentage = ndc1_percentage_average_d * share_dyadic_o_tradeflow,
    safe_multiply_percentage = safe_ndc1_percentage_average_d * share_dyadic_o_tradeflow,
    multiply_z_percentage = z_ndc1_percentage_average_d * share_dyadic_o_tradeflow,
    safe_multiply_z_percentage = z_ndc1_percentage_average_safe_d * share_dyadic_o_tradeflow,
    # -- Laws
    multiply_laws = climate_laws_d * share_dyadic_o_tradeflow,
    multiply_laws_new = climate_laws_new20142019_d * share_dyadic_o_tradeflow,
    multiply_z_laws = z_climate_laws_d * share_dyadic_o_tradeflow,
    multiply_z_laws_new = z_climate_laws_new20142019_d * share_dyadic_o_tradeflow) %>% 
  group_by(iso3_o) %>% 
  summarize(
    # -- Percentage targets
    tradeflow_percentage = sum(multiply_percentage, na.rm = T),
    tradeflow_safe_percentage = sum(safe_multiply_percentage, na.rm = T),
    tradeflow_z_percentage = sum(multiply_z_percentage, na.rm = T),
    tradeflow_z_safe_percentage = sum(safe_multiply_z_percentage, na.rm = T),
    # -- Laws
    tradeflow_laws = sum(multiply_laws, na.rm = T),
    tradeflow_laws_new = sum(multiply_laws_new, na.rm = T),
    tradeflow_z_laws = sum(multiply_z_laws, na.rm = T),
    tradeflow_z_laws_new = sum(multiply_z_laws_new, na.rm = T)) %>% 
  ungroup()


## For EITE trade
spatial_weights_eite <- left_join(trade_long, partners_targets_cs, by = "iso3_d") %>% 
  # Spatial weight in two steps
  rowwise() %>% 
  mutate(
    # -- Percentage targets
    multiply_percentage = ndc1_percentage_average_d * share_dyadic_o_eite,
    safe_multiply_percentage = safe_ndc1_percentage_average_d * share_dyadic_o_eite,
    multiply_z_percentage = z_ndc1_percentage_average_d * share_dyadic_o_eite,
    safe_multiply_z_percentage = z_ndc1_percentage_average_safe_d * share_dyadic_o_eite,
    # -- Laws
    multiply_laws = climate_laws_d * share_dyadic_o_eite,
    multiply_laws_new = climate_laws_new20142019_d * share_dyadic_o_eite,
    multiply_z_laws = z_climate_laws_d * share_dyadic_o_eite,
    multiply_z_laws_new = z_climate_laws_new20142019_d * share_dyadic_o_eite) %>% 
  group_by(iso3_o) %>% 
  summarize(
    # -- Percentage targets
    eite_percentage = sum(multiply_percentage, na.rm = T),
    eite_safe_percentage = sum(safe_multiply_percentage, na.rm = T),
    eite_z_percentage = sum(multiply_z_percentage, na.rm = T),
    eite_z_safe_percentage = sum(safe_multiply_z_percentage, na.rm = T),
    # -- Laws
    eite_laws = sum(multiply_laws, na.rm = T),
    eite_laws_new = sum(multiply_laws_new, na.rm = T),
    eite_z_laws = sum(multiply_z_laws, na.rm = T),
    eite_z_laws_new = sum(multiply_z_laws_new, na.rm = T)) %>% 
  ungroup() 

## For Green trade
spatial_weights_green <- left_join(trade_long, partners_targets_cs, by = "iso3_d") %>% 
  # Spatial weight in two steps
  rowwise() %>% 
  mutate(
    # -- Percentage targets
    multiply_percentage = ndc1_percentage_average_d * share_dyadic_o_green,
    safe_multiply_percentage = safe_ndc1_percentage_average_d * share_dyadic_o_green,
    multiply_z_percentage = z_ndc1_percentage_average_d * share_dyadic_o_green,
    safe_multiply_z_percentage = z_ndc1_percentage_average_safe_d * share_dyadic_o_green,
    # -- Laws
    multiply_laws = climate_laws_d * share_dyadic_o_green,
    multiply_laws_new = climate_laws_new20142019_d * share_dyadic_o_green,
    multiply_z_laws = z_climate_laws_d * share_dyadic_o_green,
    multiply_z_laws_new = z_climate_laws_new20142019_d * share_dyadic_o_green) %>% 
  group_by(iso3_o) %>% 
  summarize(
    # -- Percentage targets
    green_percentage = sum(multiply_percentage, na.rm = T),
    green_safe_percentage = sum(safe_multiply_percentage, na.rm = T),
    green_z_percentage = sum(multiply_z_percentage, na.rm = T),
    green_z_safe_percentage = sum(safe_multiply_z_percentage, na.rm = T),
    # -- Laws
    green_laws = sum(multiply_laws, na.rm = T),
    green_laws_new = sum(multiply_laws_new, na.rm = T),
    green_z_laws = sum(multiply_z_laws, na.rm = T),
    green_z_laws_new = sum(multiply_z_laws_new, na.rm = T)) %>% 
  ungroup() 

## Load in IO dyadic data to make IO spatial weights
io_row_standardized <- full_imputed_averaged %>% 
  select(iso3_o, iso3_d, sum_common_io) %>% 
  group_by(iso3_o, iso3_d) %>% 
  summarize(sum_common_io = mean(sum_common_io)) %>% 
  full_join(., read_csv("input/io_membership_cs.csv"),
            by = c("iso3_o" = "iso3c")) %>% 
  rowwise() %>% 
  summarize(iso3_o, iso3_d, 
            share_common_io = sum_common_io/state_io_memberships) %>% 
  # Row-standardize
  group_by(iso3_o) %>% 
  mutate(row_sum = sum(share_common_io)) %>% 
  summarize(iso3_o, iso3_d, 
            share_common_io_standardized = share_common_io/row_sum)
  
## For IO connectivity
spatial_weights_io <- left_join(io_row_standardized, 
                                 partners_targets_cs, by = "iso3_d") %>% 
  # Spatial weight in two steps
  rowwise() %>% 
  mutate(
    # -- Percentage targets
    multiply_percentage = ndc1_percentage_average_d * share_common_io_standardized,
    safe_multiply_percentage = safe_ndc1_percentage_average_d * share_common_io_standardized,
    multiply_z_percentage = z_ndc1_percentage_average_d * share_common_io_standardized,
    safe_multiply_z_percentage = z_ndc1_percentage_average_safe_d * share_common_io_standardized,
    # -- Laws
    multiply_laws = climate_laws_d * share_common_io_standardized,
    multiply_laws_new = climate_laws_new20142019_d * share_common_io_standardized,
    multiply_z_laws = z_climate_laws_d * share_common_io_standardized,
    multiply_z_laws_new = z_climate_laws_new20142019_d * share_common_io_standardized) %>% 
  group_by(iso3_o) %>% 
  summarize(
    # -- Percentage targets
    io_percentage = sum(multiply_percentage, na.rm = T),
    io_safe_percentage = sum(safe_multiply_percentage, na.rm = T),
    io_z_percentage = sum(multiply_z_percentage, na.rm = T),
    io_z_safe_percentage = sum(safe_multiply_z_percentage, na.rm = T),
    # -- Laws
    io_laws = sum(multiply_laws, na.rm = T),
    io_laws_new = sum(multiply_laws_new, na.rm = T),
    io_z_laws = sum(multiply_z_laws, na.rm = T),
    io_z_laws_new = sum(multiply_z_laws_new, na.rm = T)) %>% 
  ungroup() 



## Simple data.frame with targets

targets_analysis <- country_level_average %>% 
  filter(year == 2016) %>% 
  # select(iso3c,
  #        paris_target_safe = safe_ndc1_percentage_average,
  #        paris_target_impute = ndc1_percentage_average,
  #        glasgow_target_impute = ndc2_percentage_average,
  #        glasgow_target_safe = safe_ndc2_percentage_average)
  mutate(trade_openness = log(exp(imports) + exp(exports))) %>%               
  select(iso3c, region, income, 
         gdp_capita, trade_openness, renewables_share,
         industry, fossil_rents, vdem,
         paris_target_safe = safe_ndc1_percentage_average,
         paris_target_impute = ndc1_percentage_average,
         glasgow_target_impute = ndc2_percentage_average,
         glasgow_target_safe = safe_ndc2_percentage_average)

## Define EU countries
eu_iso <- c("AUT", "BEL", "BGR", "HRV", "CYP", 
            "CZE", "DNK", "EST", "FIN", "FRA", 
            "DEU", "GRC", "HUN", "IRL", "ITA", 
            "LVA", "LTU", "LUX", "MLT", "NLD", 
            "POL", "PRT", "ROU", "SVK", "SVN", 
            "ESP", "SWE")

trade_exposure <- trade_long %>% 
  mutate(eu = ifelse(iso3_d %in% eu_iso, 1, 0)) %>% 
  mutate(iso3_d = ifelse(eu == 1, "EU", iso3_d)) %>% 
  filter(iso3_d %in% c("USA", "CHN", "EU")) %>% 
  select(iso3_o, iso3_d, share_dyadic_o_tradeflow) %>% 
  group_by(iso3_o, iso3_d) %>% 
  summarize(trade_exposure = sum(share_dyadic_o_tradeflow)) %>% 
  ungroup() %>% 
  pivot_wider(names_from = iso3_d, values_from = trade_exposure, names_prefix = "trade_exposure_") %>% 
  rename(iso3c = iso3_o)
  
### Cosine similarity in trade

## Exports only total trade matrix 
tradeflow_matrix <- full_imputed_averaged %>% 
  select(year, iso3_o, iso3_d, tradeflow_baci) %>% 
  # Summarize dyadic flows in the five year period
  filter(year>2015, year < 2020) %>% 
  group_by(iso3_o, iso3_d) %>% 
  summarize(tradeflow_baci_exports = mean(tradeflow_baci)) %>% 
  # Get country's total exports
  group_by(iso3_o) %>% 
  mutate(total_exports = sum(tradeflow_baci_exports)) %>% 
  rowwise() %>% 
  # Row-standardize 
  summarize(iso3_o, iso3_d,
            tradeflow_exports_share = 
              tradeflow_baci_exports/total_exports) %>% 
  ungroup() %>% 
  # Pivot wider to make into a matrix
  pivot_wider(names_from = "iso3_d", 
              values_from = "tradeflow_exports_share", 
              values_fill = 0) %>% 
  relocate("AFG", .after = "iso3_o") %>% 
  column_to_rownames("iso3_o") %>% 
  as.matrix() 

# Competition matrix, cosine similarity
cosine_tradeflow_matrix <- cosine(tradeflow_matrix)

## Put cosine() matrices back into natural units by standardizing them again
cosine_tradeflow_matrix_std <- cosine_tradeflow_matrix %>% 
  as.data.frame() %>% 
  mutate(iso3_o = colnames(.)) %>% 
  pivot_longer(names_to = "iso3_d", values_to = "cosine_similarity", AFG:ZWE) %>% 
  mutate(cosine_similarity = ifelse(iso3_o == iso3_d, 0, cosine_similarity)) %>% 
  group_by(iso3_o) %>% 
  mutate(group_sum = sum(cosine_similarity),
         cosine_similarity_standardized = cosine_similarity/group_sum) %>% 
  ungroup() %>%
  select(iso3_o, iso3_d, cosine_similarity_standardized) %>% 
  pivot_wider(names_from = "iso3_d", 
              values_from = "cosine_similarity_standardized") %>% 
  column_to_rownames("iso3_o") %>% 
  as.matrix()

competition_tradeflow_percentage <- as.vector(
  partners_targets_cs$ndc1_percentage_average_d %*% cosine_tradeflow_matrix_std)

### For EITE 

# Exports only EITE trade matrix 
eite_matrix <- full_imputed_averaged %>% 
  select(year, iso3_o, iso3_d, trade_eite) %>% 
  # Summarize dyadic flows in the five year period
  filter(year>2015, year < 2020) %>% 
  group_by(iso3_o, iso3_d) %>% 
  summarize(eite_exports = mean(trade_eite)) %>% 
  ungroup() %>% 
  # Get country's total EITE exports
  group_by(iso3_o) %>% 
  mutate(total_eite_exports = sum(eite_exports)) %>% 
  rowwise() %>% 
  summarize(iso3_o, iso3_d,
            eite_exports_standardized = 
              eite_exports/total_eite_exports) %>% 
  ungroup() %>% 
  mutate(eite_exports_standardized = replace(eite_exports_standardized,
                                             is.na(eite_exports_standardized), 0)) %>% 
  pivot_wider(names_from = "iso3_d", 
              values_from = "eite_exports_standardized",
              values_fill = 0) %>% 
  relocate("AFG", .after = "iso3_o") %>% 
  column_to_rownames("iso3_o") %>% 
  as.matrix() 

# Competition matrix, cosine similarity
cosine_eite_matrix <- cosine(eite_matrix)

## Put cosine() matrices back into natural units by standardizing them again
cosine_eite_matrix_std <- cosine_eite_matrix %>% 
  as.data.frame() %>% 
  mutate(iso3_o = colnames(.)) %>% 
  pivot_longer(names_to = "iso3_d", values_to = "cosine_similarity", AFG:ZWE) %>% 
  mutate(cosine_similarity = ifelse(iso3_o == iso3_d, 0, cosine_similarity)) %>% 
  group_by(iso3_o) %>% 
  mutate(group_sum = sum(cosine_similarity),
         cosine_similarity_standardized = cosine_similarity/group_sum) %>% 
  ungroup() %>%
  select(iso3_o, iso3_d, cosine_similarity_standardized) %>% 
  pivot_wider(names_from = "iso3_d", 
              values_from = "cosine_similarity_standardized") %>% 
  column_to_rownames("iso3_o") %>% 
  as.matrix()

competition_eite_percentage <- as.vector(
  partners_targets_cs$ndc1_percentage_average_d %*% cosine_eite_matrix_std)


## Use alternative GHG series from PRIMAP
# -- Lots of code, but this is basically re-creating what's above

# Load in PRIMAP GHG data
primap <- read_csv("input/primap_v231.csv") %>% 
  filter(entity == "KYOTOGHG (AR4GWP100)") %>% # Aggregate using Kyoto GWP rules
  filter(`scenario (PRIMAP-hist)` == "HISTTP") %>% # Prioritize third-party emissions data over country data
  filter(`category (IPCC2006_PRIMAP)` == "M.0.EL") %>%  # Include all sectors
  pivot_longer(names_to = "year", values_to = "ghg_primap", `1980`:`2019`) %>% 
  select(iso3c = `area (ISO3)`, year, ghg_primap) %>% 
  filter(year > 1989) %>% 
  filter(!iso3c %in% c("ANNEXI", "AOSIS", "BASIC", "EARTH",
                       "EU28", "EU27BX", "NONANNEXI", "UMBRELLA")) %>% 
  filter(iso3c %in% core_iso3c) %>% 
  mutate(year = as.numeric(year)) %>% 
  mutate(ghg_primap = ghg_primap/1000) # into MtCO2

ndc2015_df <- read_csv("input/ndc_2015_reconciled.csv") %>% 
  filter(iso3c %in% core_iso3c) %>% 
  arrange(iso3c) %>% 
  # Make proper date variables
  mutate(date = lubridate::as_date("2015-12-12")) %>% 
  relocate(date, .after = ("country")) %>% 
  # Remove 'MtCO2' from values
  mutate_at(vars(c(cw_ndc1_ghgtarget_scenario_referencelevel,
                   cw_ndc1_ghgtarget_fixedlevel_goal,
                   cw_ndc1_ghgtarget_fixedlevel_goal_stretch)), ~parse_number(.)) %>% 
  mutate(cw_ndc1_ghgtarget_type = 
           ifelse(cw_ndc1_ghgtarget_type == "Base year", 
                  "Base Year", 
                  ifelse(cw_ndc1_ghgtarget_type == "Fixed level", 
                         "Fixed Level", cw_ndc1_ghgtarget_type)))

# Merge 2015 targets with GHG time series
primap_targets2015 <- full_join(primap, ndc2015_df, by = "iso3c") %>% 
  arrange(iso3c, year) %>% 
  # Get emissions in base year and GHG 2010 as own variables
  mutate(ndc1_baseyear_emissions = ifelse((cw_ndc1_ghgtarget_type == "Base Year" &
                                             year == cw_ndc1_ghgtarget_referenceyear),
                                          ghg_primap, NA),
         ghg_primap_2000 = ifelse(year == 2000, ghg_primap, NA),
         ghg_primap_2010 = ifelse(year == 2010, ghg_primap, NA),
         ghg_primap_2015 = ifelse(year == 2015, ghg_primap, NA)) %>% 
  group_by(iso3c) %>% 
  mutate(ndc1_baseyear_emissions = my.max(ndc1_baseyear_emissions),
         ghg_primap_2000 = my.max(ghg_primap_2000),
         ghg_primap_2010 = my.max(ghg_primap_2010),
         ghg_primap_2015 = my.max(ghg_primap_2015)) %>% 
  ungroup() %>% 
  # Get absolute emissions levels in targets
  mutate(ndc1_absolute_uncond = 
           ifelse(cw_ndc1_ghgtarget_type == "Fixed Level", cw_ndc1_ghgtarget_fixedlevel_goal, # For fixed level targets
                  ifelse(cw_ndc1_ghgtarget_type == "Scenario", # For scenario targets
                         cw_ndc1_ghgtarget_scenario_referencelevel*(1-cw_ndc1_ghgtarget_scenario_goal/100),
                         ifelse(cw_ndc1_ghgtarget_type == "Base Year", # For base year targets
                                ndc1_baseyear_emissions*(1-cw_ndc1_ghgtarget_baseyear_goal/100), NA)))) %>% 
  mutate(ndc1_absolute_cond = 
           ifelse(cw_ndc1_ghgtarget_type == "Fixed Level", cw_ndc1_ghgtarget_fixedlevel_conditional, # For fixed level targets
                  ifelse(cw_ndc1_ghgtarget_type == "Scenario", # For scenario targets
                         cw_ndc1_ghgtarget_scenario_referencelevel*(1-cw_ndc1_ghgtarget_scenario_goal_conditional/100),
                         ifelse(cw_ndc1_ghgtarget_type == "Base Year", # For base year targets
                                ndc1_baseyear_emissions*(1-cw_ndc1_ghgtarget_baseyear_goal_conditional/100), NA)))) %>% 
  # Get emissions levels in targets as percentage of 2010 emissions (same benchmark as PRIMAP)
  mutate(ndc1_percentage_uncond = ndc1_absolute_uncond/ghg_primap_2010,
         ndc1_percentage_cond = ndc1_absolute_cond/ghg_primap_2010) %>% 
  # Get an average across both NDC targets (unconditional and conditional)
  rowwise() %>% 
  mutate(ndc1_percentage_average = mean(c(
    ndc1_percentage_uncond, ndc1_percentage_cond), na.rm=T)) %>% 
  # Re-scale the targets so that higher values are cuts, zero is constant
  mutate_at(vars(ndc1_percentage_average),
            ~(1-.)*100) %>% 
  # Drop repeated observations
  select(-year, -ghg_primap) %>% 
  distinct() %>% 
  filter(iso3c %in% core_iso3c) %>% 
  # Keep only the core variables
  select(iso3c, ndc1_percentage_average) %>% 
  mutate_at(vars(ndc1_percentage_average), ~replace(., is.na(.), NA)) %>% 
  as.data.frame()


### Load in 2021 NDC targets data

## Load in SR 2021 NDC data
ndc2021 <- read_excel("input/ndc_sr_2021.xlsx", sheet = "2021 Dataset") %>% 
  arrange(iso3c) %>% 
  # Make proper date variables
  mutate(date = lubridate::as_date(submission_date_yy_mm_dd)) %>% 
  relocate(date, .after = ("country")) %>% 
  select(-submission_date_yy_mm_dd) %>% 
  # Remove 'MtCO2' from values
  mutate_at(vars(c(cw_ndc2_ghgtarget_scenario_referencelevel,
                   cw_ndc2_ghgtarget_fixedlevel_goal,
                   cw_ndc2_ghgtarget_fixedlevel_goal_stretch,
                   cw_ndc2_ghgtarget_fixedlevel_conditional)), ~parse_number(.))

# Merge 2021 targets with GHG time series
primap_targets2021 <- full_join(primap, ndc2021, by = "iso3c") %>%
  arrange(iso3c, year) %>% 
  # Get emissions in base year and GHG 2010 as own variables
  mutate(ndc2_baseyear_emissions = ifelse((cw_ndc2_ghgtarget_type == "Base Year" &
                                             year == cw_ndc2_ghgtarget_referenceyear),
                                          ghg_primap, NA),
         ghg_primap_2000 = ifelse(year == 2000, ghg_primap, NA),
         ghg_primap_2010 = ifelse(year == 2010, ghg_primap, NA),
         ghg_primap_2015 = ifelse(year == 2015, ghg_primap, NA)) %>% 
  group_by(iso3c) %>% 
  mutate(ndc2_baseyear_emissions = my.max(ndc2_baseyear_emissions),
         ghg_primap_2000 = my.max(ghg_primap_2000),
         ghg_primap_2010 = my.max(ghg_primap_2010),
         ghg_primap_2015 = my.max(ghg_primap_2015)) %>% 
  ungroup() %>% 
  # Get absolute emissions levels in targets
  ungroup() %>% 
  mutate(ndc2_absolute_uncond = 
           ifelse(cw_ndc2_ghgtarget_type == "Fixed Level", cw_ndc2_ghgtarget_fixedlevel_goal, # For fixed level targets
                  ifelse(cw_ndc2_ghgtarget_type == "Scenario", # For scenario
                         cw_ndc2_ghgtarget_scenario_referencelevel*(1-cw_ndc2_ghgtarget_scenario_unconditional_goal/100),
                         ifelse(cw_ndc2_ghgtarget_type == "Base Year", # For base year
                                ndc2_baseyear_emissions*(1-cw_ndc2_ghgtarget_baseyear_goal/100), NA)))) %>% 
  mutate(ndc2_absolute_cond = 
           ifelse(cw_ndc2_ghgtarget_type == "Fixed Level", cw_ndc2_ghgtarget_fixedlevel_conditional,
                  ifelse(cw_ndc2_ghgtarget_type == "Scenario",
                         cw_ndc2_ghgtarget_scenario_referencelevel*(1-cw_ndc2_ghgtarget_scenario_goal_conditional/100),
                         ifelse(cw_ndc2_ghgtarget_type == "Base Year",
                                ndc2_baseyear_emissions*(1-cw_ndc2_ghgtarget_baseyear_goal_conditional/100), NA)))) %>% 
  # Get emissions levels in targets as percentage of 2010 emissions (same benchmark as PRIMAP)
  mutate(ndc2_percentage_uncond = ndc2_absolute_uncond/ghg_primap_2010,
         ndc2_percentage_cond = ndc2_absolute_cond/ghg_primap_2010) %>% 
  # Get an average across both NDC targets (unconditional and conditional)
  rowwise() %>% 
  mutate(ndc2_percentage_average = mean(c(
    ndc2_percentage_uncond, ndc2_percentage_cond), na.rm=T)) %>% 
  # Re-scale the targets so that higher values are cuts, zero is constant
  mutate_at(vars(ndc2_percentage_average),
            ~(1-.)*100) %>% 
  ungroup() %>% 
  # Drop repeated observations
  select(-year, -ghg_primap) %>% 
  distinct() %>% 
  # Keep only "final" NDC 
  group_by(iso3c) %>% 
  mutate(entry = row_number(),
         entry_max = my.max(entry)) %>% 
  ungroup() %>% 
  filter(entry==entry_max) %>% 
  filter(iso3c %in% core_iso3c) %>%
  # Keep only the core variables
  select(iso3c, ndc2_percentage_average) %>% 
  mutate_at(vars(ndc2_percentage_average), ~replace(., is.na(.), NA)) %>% 
  as.data.frame()

## Get PRIMAP targets only
primap_targets <- full_join(primap_targets2015, primap_targets2021, by = 'iso3c') %>% 
  select(iso3c, paris_target_primap = ndc1_percentage_average, 
         glasgow_target_primap = ndc2_percentage_average) %>% 
  mutate_at(vars(paris_target_primap:glasgow_target_primap),
            ~Winsorize(., probs = c(0.05, 0.95), na.rm = T)) 


#### 6. Make analysis dataframe with targets, covariates, spatial weights ####

data_for_models <- full_join(targets_analysis, spatial_weights_tradeflow, by = c("iso3c" = "iso3_o")) %>% 
  full_join(., spatial_weights_eite, by = c("iso3c" = "iso3_o")) %>% 
  full_join(., spatial_weights_green, by = c("iso3c" = "iso3_o")) %>% 
  full_join(., spatial_weights_io, by = c("iso3c" = "iso3_o")) %>% 
  cbind(., competition_tradeflow_percentage, competition_eite_percentage) %>% 
  full_join(., trade_exposure, by = c("iso3c")) %>% 
  full_join(., primap_targets, by = c("iso3c")) %>% 
  mutate_at(vars(paris_target_safe:glasgow_target_primap),
            ~Winsorize(., probs = c(0.05, 0.95), na.rm = T)) 
  
saveRDS(data_for_models, file = "output/data_for_models20230920.Rds") # IO, properly

